Load Data

# Read the non-imputed LFQ intensity table (rows = proteins, cols = samples).
lfq <- read.csv(file = "./data/nonimputed_lfq.csv")
KO.22 KO.23 KO.24 WT.22 WT.23 WT.24
NA NA NA NA NA NA
NA NA NA NA NA NA
NA NA NA NA NA NA
NA NA NA NA NA NA
NA NA 23.19647 NA 22.00220 NA
23.88101 23.58256 26.41355 24.20970 23.28630 23.16521
25.29155 25.42259 24.77465 24.84105 25.97572 25.31251
NA NA 24.25638 NA NA NA
23.25265 22.10623 25.05821 24.50246 22.65325 23.18450
NA NA NA NA NA NA

Data Preprocessing

# Split the LFQ table into knockout (KO) and wild-type (WT) sample columns.
LFQ_KO <- select(lfq, contains('KO'))

LFQ_WT <- select(lfq, contains('WT'))

Data Mining

# Stack the KO and WT intensity histograms in one two-row figure.
ko_hist <- plothist(LFQ_KO, '- KO')
wt_hist <- plothist(LFQ_WT, '- WT')
ggarrange(ko_hist, wt_hist, nrow = 2, ncol = 1)

# Skewness of the non-imputed data (per sample column, NAs dropped).
moments::skewness(lfq, na.rm = TRUE)
##     KO.22     KO.23     KO.24     WT.22     WT.23     WT.24 
## 0.6754195 0.6725489 0.5218363 0.7211065 0.6383786 0.7043256
# Skewness of the imputed data, for comparison with the values above.
lfq.imp <- read.csv('./data/LFQ_raw_totals_imp.csv')
moments::skewness(lfq.imp)
## KO_TOTALS_22 KO_TOTALS_23 KO_TOTALS_24 WT_TOTALS_22 WT_TOTALS_23 WT_TOTALS_24 
##    0.5761698    0.5868388    0.5206581    0.6285968    0.5813767    0.6579983

The imputation does not noticeably change the skewness of the data. 🕺 If anything, it improves it somewhat: the skewness coefficients are lower for the imputed data.

protti library

library(protti)

Data Preprocessing

Thanks to Mateusz, we had data almost ready for analysis with protti, but some important columns were missing. So, based on Matis’ code, I transformed the data and selected features following the Input Preparation workflow from the protti documentation.

# Read the MaxQuant protein groups table (tab-separated).
proteingroups <- readr::read_tsv("data/proteinGroups.txt", show_col_types = FALSE)

# Standard MaxQuant cleanup: drop proteins only identified by a modification
# site, decoy (reverse) hits, and potential contaminants. These flag columns
# are non-NA (typically "+") when the flag applies, so keeping the NA rows
# retains the usable proteins.
proteingroups <- proteingroups %>% filter(is.na(`Only identified by site`),
                         is.na(Reverse),
                         is.na(`Potential contaminant`))

# Reshape the filtered protein groups table into the long format protti
# expects: one row per (protein, sample) with a log2 LFQ intensity.
proteingroups <- proteingroups %>%
    # Keep IDs, peptide sequences and the six TOTALS LFQ columns (reps 22-24).
    select(`Protein IDs`, `Peptide sequences`, 
           contains('LFQ intensity') & contains('TOTALS') & (ends_with('22') | ends_with('23') | ends_with('24'))) %>%
    # Replace long protein ID strings with compact sequential labels.
    # row_number() avoids referencing the outer `proteingroups` object from
    # inside the pipe, and paste0() replaces paste(..., sep = '').
    mutate(`Protein IDs` = paste0('prot_', row_number())) %>%
    # Select the LFQ columns by name rather than by position (3:8) so this
    # step stays correct if the column order ever changes.
    pivot_longer(contains('LFQ intensity'), names_to = 'Sample', values_to = 'Intensity')%>%
    mutate(Sample = gsub('LFQ intensity ', '', Sample)) %>%
    # Sample names look like KO_TOTALS_22 -> celltype / sampletype / rep.
    separate(col =  Sample, into = c("celltype","sampletype","rep"), sep = "_", remove = F) %>%
    # KO = treated, WT = control; MaxQuant encodes "not quantified" as 0,
    # which must become NA before the log2 transform.
    mutate(Condition = ifelse(celltype == 'KO', 'treated', 'control'),
           Intensity = ifelse(Intensity == 0, NA, log2(Intensity))) %>%
    select(Sample, `Protein IDs`, `Peptide sequences`, Condition, Intensity) 
Sample Protein IDs Peptide sequences Condition Intensity
KO_TOTALS_22 prot_1 SEQEDEVLLVSSSR;VLQCHKPVHAEYLEK treated NA
KO_TOTALS_23 prot_1 SEQEDEVLLVSSSR;VLQCHKPVHAEYLEK treated NA
KO_TOTALS_24 prot_1 SEQEDEVLLVSSSR;VLQCHKPVHAEYLEK treated NA
WT_TOTALS_22 prot_1 SEQEDEVLLVSSSR;VLQCHKPVHAEYLEK control NA
WT_TOTALS_23 prot_1 SEQEDEVLLVSSSR;VLQCHKPVHAEYLEK control NA
WT_TOTALS_24 prot_1 SEQEDEVLLVSSSR;VLQCHKPVHAEYLEK control NA


We can do most of the calculations from the protti library with data structured as above.

Normalization

The protti library provides a function to normalise data, but it supports only the median method: the normalised value is the original intensity minus the run median plus the global median.

# Median normalisation of the log2 intensities (the only method that
# protti::normalise currently offers).
normalised_data <- protti::normalise(
    proteingroups,
    sample = Sample,
    intensity_log2 = Intensity,
    method = 'median'
)

# Violin-over-boxplot view of the normalised log2 intensities per sample.
# Layer order is intentional: the semi-transparent violin is drawn on top.
ggplot(normalised_data, aes(x = Sample, y = normalised_intensity_log2, fill = Sample)) +
    geom_boxplot(width = 0.2) +
    geom_violin(alpha = 0.4) +
    labs(title = 'Violin plot of Normalised LFQ Intensity',
         y = ' Normalised LFQ Values',
         x = '') +
    theme(legend.position = 'none')

# Per-sample skewness after median normalisation. The values match the raw
# data exactly: adding/subtracting medians is a constant shift per run and
# does not change skewness.
normalised_data %>%
    group_by(Sample) %>%
    summarise(skewness = moments::skewness(normalised_intensity_log2, na.rm=TRUE))
## # A tibble: 6 × 2
##   Sample       skewness
##   <chr>           <dbl>
## 1 KO_TOTALS_22    0.675
## 2 KO_TOTALS_23    0.673
## 3 KO_TOTALS_24    0.522
## 4 WT_TOTALS_22    0.721
## 5 WT_TOTALS_23    0.638
## 6 WT_TOTALS_24    0.704

Imputation

The library gives us 2 methods for imputation:

  1. method = "ludovic", MNAR missingness is sampled from a normal distribution around a value that is three lower (log2) than the lowest intensity value recorded for the precursor/peptide and that has a spread of the mean standard deviation for the precursor/peptide.
  2. method = "noise", MNAR missingness is sampled from a normal distribution around the mean noise for the precursor/peptide and that has a spread of the mean standard deviation (from each condition) for the precursor/peptide.
# Assigning missing rows: label each protein's missingness pattern per
# condition so impute() knows which strategy to apply.
# NOTE(review): ref_condition = 'all' should build comparisons between all
# conditions — confirm against the protti assign_missingness() docs.
data_missing <- normalised_data %>%
    assign_missingness(sample=Sample,
                       condition = Condition,
                       grouping = `Protein IDs`,
                       intensity = normalised_intensity_log2,
                       ref_condition = 'all')

Method ludovic

# Impute MNAR values with the "ludovic" method: samples from a normal
# distribution centred three log2 units below the lowest recorded intensity
# for the protein (see the method description above).
# skip_log2_transform_error = TRUE because the intensities were already
# log2-transformed during input preparation.
ludovic_imp <- impute(
    data_missing,
    sample = Sample,
    grouping = `Protein IDs`,
    intensity_log2 = normalised_intensity_log2,
    condition = Condition,
    comparison = comparison,
    missingness = missingness,
    method = 'ludovic',
    skip_log2_transform_error = TRUE
)

The imputation ran, but some columns have too much missing data for the function to impute a value there. The conclusion is that we still have many missing values for peptides that never had an LFQ value recorded.

Method noise

# Impute MNAR values with the "noise" method: samples from a normal
# distribution around the mean noise for the protein (see the method
# description above). Same arguments as the ludovic run, so the two
# results are directly comparable.
noise_imp <- impute(
    data_missing,
    sample = Sample,
    grouping = `Protein IDs`,
    intensity_log2 = normalised_intensity_log2,
    condition = Condition,
    comparison = comparison,
    missingness = missingness,
    method = 'noise',
    skip_log2_transform_error = TRUE
)

Visualizations

# Collect the sample labels and both imputed intensity vectors side by side.
# Naming the columns at construction time replaces the separate
# colnames()<- assignment of the original version.
df_to_skew <- data.frame(
    sample  = ludovic_imp$Sample,
    ludovic = ludovic_imp$imputed_intensity,
    noise   = noise_imp$imputed_intensity
)

# Per-sample skewness for each imputation method.
df_to_skew %>%
    group_by(sample) %>%
    summarise(skewness_ludovic = moments::skewness(ludovic, na.rm=TRUE),
              skewness_noise = moments::skewness(noise, na.rm=TRUE))
## # A tibble: 6 × 3
##   sample       skewness_ludovic skewness_noise
##   <chr>                   <dbl>          <dbl>
## 1 KO_TOTALS_22            0.611          0.680
## 2 KO_TOTALS_23            0.610          0.682
## 3 KO_TOTALS_24            0.483          0.557
## 4 WT_TOTALS_22            0.540          0.726
## 5 WT_TOTALS_23            0.488          0.645
## 6 WT_TOTALS_24            0.533          0.711
# Overall skewness per method, all samples pooled.
moments::skewness(df_to_skew[,2:3], na.rm=TRUE)
##   ludovic     noise 
## 0.5458434 0.6711414

The Ludovic imputation method yields a lower skewness coefficient. Importantly, both imputation methods decrease the skewness coefficient.

Statistics

DIFFERENTIAL ABUNDANCE AND SIGNIFICANCE

In the tutorial, they calculate Differential Abundance and Significance using data prepared as above. We can run the calculations on missing or imputed data. As in the tutorial, we use the moderated t-test to calculate this statistic.

All methods:

  1. t-test - Welch test.
  2. t-test_mean_sd - Welch test on means, standard deviations and number of replicates.
  3. moderated_t-test - a moderated t-test based on the limma package
  4. proDA - can be used to infer means across samples based on a probabilistic dropout model. This eliminates the need for data imputation since missing values are inferred from the model.
# Differential abundance of treated (KO) vs. control (WT) on the
# ludovic-imputed intensities, using the limma-based moderated t-test.
# NOTE(review): filter_NA_missingness = TRUE should drop proteins whose
# missingness could not be assigned — confirm against the protti docs.
result <- ludovic_imp %>%
    calculate_diff_abundance(sample=Sample,
                             condition = Condition,
                             grouping = `Protein IDs`,
                             intensity_log2 = imputed_intensity,
                             missingness = missingness,
                             comparison = comparison,
                             filter_NA_missingness = TRUE,
                             method = 'moderated_t-test')

The result can be visualized with a volcano plot, which can also be made interactive.

# Volcano plot of log2 fold change (diff) vs. p-value; proteins passing
# the adjusted-p-value cutoff of 0.05 are highlighted as significant.
# interactive = TRUE presumably renders an interactive (plotly) version —
# confirm against the protti volcano_plot() docs.
result %>%
    volcano_plot(grouping = `Protein IDs`,
                 log2FC = diff,
                 significance = pval,
                 method = 'significant',
                 significance_cutoff = c(0.05, "adj_pval"),
                 interactive = TRUE)

COEFFICIENT OF VARIATION

#' Coefficient of variation, expressed as a percentage.
#'
#' @param x A numeric vector.
#' @param na.rm Remove NA values before computing sd/mean? Defaults to
#'   FALSE, preserving the original behaviour (any NA in x yields NA).
#' @return sd(x) / mean(x) * 100 as a single numeric value.
cv <- function(x, na.rm = FALSE) (sd(x, na.rm = na.rm) / mean(x, na.rm = na.rm)) * 100

# Extract the imputed rows whose proteins also appear in the statistics
# result (i.e. proteins with values for all samples).
all_imp <- as.data.frame(ludovic_imp)[which(ludovic_imp$`Protein IDs` %in% result$`Protein IDs`),]

# Reshape to wide format (one column per sample) so the CV can be computed
# row-wise across the six samples for each protein.
lfq_wider <- all_imp %>%
    select(Sample, imputed_intensity) %>%
    pivot_wider(names_from = Sample, values_from = imputed_intensity, values_fn = list) %>%
    unnest(cols = c(KO_TOTALS_22, KO_TOTALS_23, KO_TOTALS_24, WT_TOTALS_22, WT_TOTALS_23, WT_TOTALS_24))

# Row-wise coefficient of variation. apply() coerces the data frame to a
# matrix, which is acceptable here since every column is numeric.
cv_imp <- apply(lfq_wider, 1, cv)

plotoneviolin(cv_imp, xlab='Non-Imputed')

mean median sd
2.618871 2.28552 1.547142

Interesting Observation

When we run the code below, we get a data frame with 5138 rows.

# Read the long-format LFQ export.
lfq_long <- read.csv(file = './data/LFQ_long_raw.csv')

# Row counts per TOTALS sample (replicates 22/23/24): every sample should
# carry the same number of protein entries.
lfq_long %>%
    select(Sample) %>%
    filter(grepl('TOTALS', Sample),
           grepl('22', Sample) | grepl('23', Sample) | grepl('24', Sample)) %>%
    group_by(Sample) %>%
    summarise(test = length(Sample))
## # A tibble: 6 × 2
##   Sample        test
##   <chr>        <int>
## 1 KO_TOTALS_22  5138
## 2 KO_TOTALS_23  5138
## 3 KO_TOTALS_24  5138
## 4 WT_TOTALS_22  5138
## 5 WT_TOTALS_23  5138
## 6 WT_TOTALS_24  5138

But when we read the original data from proteingroups.txt and apply some basic filtering, we get 5905 rows.